############################################
# Markowitz Arctic project
# difference-in-differences regression
# Miriam Barnum
# December 6, 2022
############################################

library(tidyverse)
library(countrycode)
library(lubridate)
library(lmtest)
library(sandwich)

##################################################################################################
## Event counts (Table 3)
##################################################################################################
events <- read.csv("ArcticDataset_Master_MilitaryActivty12-02-2022.csv",
                   na.strings = c(".", "", "NA"))

#therese's code
shock <- ymd("2007-06-30")

# Coding rule: if we don't have a start date, we take the end date to distinguish pre/post shock
events_cleaned <- events %>%
  dplyr::select(-ccode) %>% #Removing variables to recode them later
  mutate(year = replace(year, eventid == 285 & countryname == "Norway", 2010), # Correcting coding mistake
         ccode = countrycode(countryname, "country.name", "cown"), # Adding correct COW codes
         start_date = as.character(start_date),
         end_date = as.character(end_date),
         start_date = mdy(start_date),
         end_date = replace(end_date, end_date == "22/06/09", "06/22/09"), #Correcting end date
         end_date = mdy(end_date)) %>%
  mutate(preshock = ifelse(start_date < shock, 1, 0), #Creating pre-shock, post-shock dummy
         #preshock = replace(preshock, year >= 2008, 0),
         #preshock = replace(preshock, year < 2007, 1),
         #preshock = replace(preshock, eventid == 144, 1),
         #preshock = replace(preshock, eventid == 145, 0),
         dispute = rowSums(.[c("areadisp", "borders", "airspace", "areaclaimed")], na.rm = T),
         dispute = replace(dispute, dispute > 0, 1)) %>%  #Creating a dummy for disputs (not within borders, not power projection)
  filter(!is.na(beybord), #Dropping one observation that does not have a coding for beybord
         !is.na(preshock)) %>% #Dropping one observation that does not have a coding for preshock
  mutate(type = "withinborders",
         type = replace(type, (beybord == 1 & dispute == 0), "powerprojection"),
         type = replace(type, (beybord == 0 & dispute == 1), NA),
         type = replace(type, (beybord == 1 & dispute == 1), "disputedarea")) %>%
  mutate(type_detailed = type,
         type_detailed = replace(type_detailed, type == "powerprojection" & Intercept == 1, "intercept"),
         type_detailed = replace(type_detailed, type == "powerprojection" & Research == 1, "research"),
         type_detailed = replace(type_detailed, type == "disputedarea" & Intercept == 1, "intercept"),
         type_detailed = replace(type_detailed, type == "disputedarea" & Research == 1, "research")) %>%
  mutate(type_binary = ifelse(beybord == 1, "Beyond borders", "Within borders"))

## Recoding factor levels
# Type variable
table(events_cleaned$type)
events_cleaned$type <- factor(events_cleaned$type,
                              levels = c("withinborders",
                                         "powerprojection",
                                         "disputedarea"),
                              labels = c("Within borders",
                                         "Power projection",
                                         "To disputed area/other state's borders"))

# Type detailed variable
events_cleaned$type_detailed <- factor(events_cleaned$type_detailed,
                                       levels = c("withinborders",
                                                  "intercept",
                                                  "research",
                                                  "powerprojection",
                                                  "disputedarea"),
                                       labels = c("Within borders",
                                                  "Intercept",
                                                  "Research",
                                                  "Other power projection",
                                                  "Other to disputed area/other state's borders"))

# Country variable
events_cleaned$countryname <- factor(events_cleaned$countryname,
                                     levels = c("USA",
                                                "Canada",
                                                "Denmark",
                                                "Norway",
                                                "Russia"),
                                     labels = c("United States",
                                                "Canada",
                                                "Denmark",
                                                "Norway",
                                                "Russia"))

# Pre-post shock
events_cleaned$preshock <- factor(events_cleaned$preshock,
                                  levels = c(1, 0),
                                  labels = c("Pre shock", "Post shock"))

# end therese's code

# aggregate event counts by quarter

base <- data.frame(countryname = rep(c("United States",
                                       "Canada",
                                       "Denmark",
                                       "Norway",
                                       "Russia"), 24),
                   iso = rep(c("USA","CAN","DNK","NOR","RUS"), 24),
                   year = rep(2005:2010, each = 20),
                   q = rep(rep(1:4, each = 5), 6))

events_quarterly <- events_cleaned %>%
  mutate(month = month(start_date)) %>%
  mutate(q = ifelse(month <= 3, 1, ifelse(month <= 6, 2, ifelse(month <= 9, 3, 4)))) %>%
  group_by(year, q, countryname) %>%
  summarise(count = n()) %>%
  right_join(base) %>%
  mutate(count = replace(count, is.na(count), 0)) %>%
  unite(quarter, year, q, sep = "-Q", remove = F) 

# add gdp, milex, postshock, and land_oriented variables

gdp <- read.csv("DP_LIVE_09012020234322113.csv") %>%
  pivot_wider(id_cols = c("LOCATION", "TIME"), names_from = MEASURE, values_from = Value) %>%
  rename(gdp_pct = PC_CHGPP, gdp_vol = IDX)


cinc <- readRDS("cincdat.rds") %>%
  select(country, year, milex)
cinc$country[cinc$country == "United States of America"] <- "United States"
cinc$country[cinc$country == "Russia (Soviet Union)"] <- "Russia"

library(readxl)
polity <- read_excel("p4v2017.xls") %>%
  select(country, year, polity2)

events_quarterly <- events_quarterly %>%
  filter(year < 2010) %>% # so that we have the same amount of time before and after the shock
  mutate(land_oriented = as.numeric(countryname %in% c("Russia","Norway"))) %>%
  mutate(postshock = as.numeric(quarter > "2007-Q2")) %>%
  left_join(cinc, by = c("countryname" = "country", "year")) %>%
  mutate(milex = log(milex)) %>%
  left_join(gdp, by = c("iso" = "LOCATION", "quarter" = "TIME")) %>%
  left_join(polity, by = c("countryname" = "country", "year"))

###########
# t-tests
###########

# comparing land oriented countries before and after the shock
# p-value = 0.05951
# mean of x mean of y 
# 4.35      1.60 
t.test(events_quarterly$count[events_quarterly$land_oriented & events_quarterly$postshock],
       events_quarterly$count[events_quarterly$land_oriented & !events_quarterly$postshock])

# comparing production oriented countries before and after the shock
# p-value = 0.6474
# mean of x mean of y 
# 2.133333  1.833333 
t.test(events_quarterly$count[!events_quarterly$land_oriented & events_quarterly$postshock],
       events_quarterly$count[!events_quarterly$land_oriented & !events_quarterly$postshock])

# land oriented vs. production oriented after the shock
# p-value = 0.1311
# mean of x mean of y 
# 4.350000  2.133333 
t.test(events_quarterly$count[events_quarterly$land_oriented & events_quarterly$postshock],
       events_quarterly$count[!events_quarterly$land_oriented & events_quarterly$postshock])

# land oriented vs. production oriented before the shock
# p-value = 0.6889
# mean of x mean of y 
# 1.600000  1.833333
t.test(events_quarterly$count[events_quarterly$land_oriented & !events_quarterly$postshock],
       events_quarterly$count[!events_quarterly$land_oriented & !events_quarterly$postshock])

##############
# diff-in-diff
##############

events_quarterly$did <- events_quarterly$land_oriented * events_quarterly$postshock

didreg <- lm(count ~ did + land_oriented + postshock, data = events_quarterly)
r2 <- signif(summary(didreg)$r.squared, 2)
d <- coeftest(didreg, vcov = vcovHC(didreg, cluster = "countryname", type="HC1"))

did_control = lm(count ~ did + land_oriented + postshock + gdp_pct + milex, data = events_quarterly)
r2c <- signif(summary(did_control)$r.squared, 2)
dc <- coeftest(did_control, vcov = vcovHC(did_control, cluster = "countryname", type="HC1"))

did_control2 = lm(count ~ did + land_oriented + postshock + gdp_vol + milex, data = events_quarterly)
r2c2 <- signif(summary(did_control2)$r.squared, 2)
dc2 <- coeftest(did_control2, vcov = vcovHC(did_control2, cluster = "countryname", type="HC1"))

did_control3 = lm(count ~ did + land_oriented + postshock + I(log(gdp_vol)) + milex, data = events_quarterly)
r2c3 <- signif(summary(did_control3)$r.squared, 2)
dc3 <- coeftest(did_control3, vcov = vcovHC(did_control3, cluster = "countryname", type="HC1"))

did_control4 = lm(count ~ did + land_oriented + postshock + I(log(gdp_vol)) + gdp_pct + milex, data = events_quarterly)
r2c4 <- signif(summary(did_control4)$r.squared, 2)
dc4 <- coeftest(did_control4, vcov = vcovHC(did_control4, cluster = "countryname", type="HC1"))

# did_control4 = lm(count ~ did + land_oriented + postshock + I(log(gdp_vol)) + milex + polity2, data = events_quarterly)
# summary(did_control4) 

library(stargazer)
covars = c("Difference in differences", "Resource-dependent", "Post-shock","Quarterly GDP", 
           "ln(Quarterly GDP)", "Quarterly GDP (% change)", "ln(Military expenditure)")

#  Table 3 (paper) 
stargazer(d, dc2, dc3, dc, dc4, 
          dep.var.labels = "Events per Quarter", covariate.labels = covars, 
          add.lines = list(c("Observations", nobs(didreg), nobs(did_control2), nobs(did_control3), nobs(did_control), nobs(did_control4)), 
                           c("$R2$", r2, r2c2, r2c3, r2c, r2c4)),
          type = "html", out = "arctic_did_cluster.html",
          notes = c("Results from OLS regressions with quarterly count of Arctic military activity events as the outcome. Units are country-quarters, spanning the years 2005 to 2009. Robust standard  errors clustered by country in parentheses. Russia and Norway are coded as resource-dependent. The post-shock period begins in June 2007."))

#stargazer(didreg, did_control2, did_control3, did_control, #did_control4, 
#          dep.var.labels = "Events per Quarter", covariate.labels = covars, type = "html", out = "arctic_did.html")


##################################################################################################
## Icebreakers (Table A4)
##################################################################################################

ice <- read.csv("Icebreakers w Country Breakdown 05-31-18 LL.csv",
                na.strings = c("NA", ""),
                stringsAsFactors = F,
                fileEncoding="latin1")

# therese's code
shock <- ymd("2007-06-30")
year_20year_min <- ymd("1997-01-01")
year_5year_min <- ymd("2005-01-01")
year_20year_max <- ymd("2017-12-31")
year_5year_max <- ymd("2009-12-31")

date_names <- c("Date.Announced",
                "Budget.Allocated",
                "Construction.Started",
                "Date.of.Commission",
                "Date.Removed.from.Service",
                "Date.Returned.to.Service",
                "Date.of.Decommission")

ice_new <- ice %>%
  filter(!is.na(Country)) %>%
  mutate(Country = str_replace_all(Country, " $", ""),
         Status.December.31.2017 = str_replace_all(Status.December.31.2017, " $", ""),
         Status.Pre.Shock = str_replace_all(Status.Pre.Shock, " $", "")) %>%
  dplyr::select(Name,
                Country,
                tonnage = Displacement..tonnes.,
                one_of(date_names),
                Status.pre.1997,
                Status.Pre.Shock,
                Status.December.31.2017) %>%
  
  mutate(tonnage = str_replace_all(tonnage, ",", ""),
         tonnage = as.numeric(tonnage)) %>%
  
  # Recoding variables
  mutate(Status.Pre.Shock = replace(Status.Pre.Shock, str_detect(Status.Pre.Shock, "built Preshock"), "Built Preshock")) %>%
  
  # creating unique ship indicators
  mutate(ship = seq(1,nrow(.))) %>%
  
  # correcting dates
  mutate(Date.Announced = dmy(Date.Announced),
         Budget.Allocated = dmy(Budget.Allocated),
         Construction.Started = dmy(Construction.Started),
         Date.of.Commission = dmy(Date.of.Commission),
         Date.Removed.from.Service = mdy(Date.Removed.from.Service),
         Date.Returned.to.Service = mdy(Date.Returned.to.Service),
         Date.of.Decommission = dmy(Date.of.Decommission))


ice_new20 <- ice_new %>%
  
  dplyr::select(country = Country,
                everything()) %>%
  
  # Gathering columns
  gather(status, date, Date.Announced:Date.of.Decommission) %>%
  
  filter(!is.na(date)) %>%
  filter(date < year_20year_max) %>%
  
  # creating time slots
  mutate(shock = ifelse(date < shock, "preshock", "postshock")) %>%
  filter(!is.na(shock)) %>%
  
  
  # Latest dates
  group_by(ship, shock) %>%
  slice(which.max(date)) %>%
  mutate(new = paste(status, date, sep = ";")) %>%
  dplyr::select(-status, - date) %>%
  spread(shock, new) %>%
  separate(preshock, c("status_preshock", "date_preshock"), sep = ";") %>%
  separate(postshock, c("status_postshock", "date_postshock"), sep = ";") %>%
  
  # recoding preshock events
  mutate(pre_new = ifelse(status_preshock %in% c("Date.Announced", 
                                                 "Budget.Allocated",
                                                 "Construction.Started"), NA, status_preshock),
         pre_new = replace(pre_new, status_preshock == "Removed From Service", NA),
         pre_new = replace(pre_new, status_preshock == "Date.of.Commission" & date_preshock < mdy("1/1/1997"), "Built Pre-1997"),
         pre_new = replace(pre_new, status_preshock == "Date.of.Commission" & date_preshock > mdy("1/1/1997"), "Built Preshock"),
         pre_new = replace(pre_new, status_preshock == c("Date.Removed.from.Service"), "Removed From Service")) %>%
  
  mutate(post_new = ifelse(status_postshock %in% c("Date.Announced", 
                                                   "Budget.Allocated",
                                                   "Construction.Started"), NA, status_postshock),
         post_new = replace(post_new, status_postshock == "Date.of.Commission", "Built Postshock"),
         post_new = replace(post_new, status_postshock %in% c("Date.of.Decommission"), "Decommissioned"),
         post_new = replace(post_new, status_postshock %in% c("Date.Returned.to.Service"), "Returned to Service"),
         post_new = replace(post_new, !(status_postshock %in% c("Date.Announced", 
                                                                "Budget.Allocated",
                                                                "Construction.Started",
                                                                "Date.of.Commission",
                                                                "Date.of.Decommission",
                                                                "Date.Returned.to.Service")), pre_new)) %>%
  
  # turning to long format
  gather(indicator, value, pre_new:post_new) %>%
  
  mutate(newinvestment = ifelse(indicator == "pre_new" & value == "Built Preshock", 1, 0),
         newinvestment = replace(newinvestment, indicator == "post_new" & value %in% c("Built Postshock"), 1),
         count = ifelse(indicator == "pre_new" & value %in% c("Built Pre-1997", "Built Preshock"), 1, 0),
         count = replace(count, indicator == "post_new" & value %in% c("Built Pre-1997", 
                                                                       "Built Preshock", 
                                                                       "Built Postshock",
                                                                       "Returned to Service"), 1))

ice_new20$indicator <- factor(ice_new20$indicator,
                              levels = c("pre_new", "post_new"),
                              labels = c("Pre shock", "Post shock"))

ice_new20$country <- factor(ice_new20$country, 
                            levels = c("United States",
                                       "Canada",
                                       "Denmark",
                                       "Norway",
                                       "Russia"))


ice_new20$countryname_gdp <- factor(ice_new20$country,
                                    levels = c("United States",
                                               "Canada",
                                               "Denmark",
                                               "Norway",
                                               "Russia"),
                                    labels = c("United States\n(GDP 15.1 trillion USD)",
                                               "Canada\n(GDP 1.6 trillion USD)",
                                               "Denmark\n(GDP 0.3 trillion USD)",
                                               "Norway\n(GDP 0.4 trillion USD)",
                                               "Russia\n(GDP 1.5 trillion USD)"))

## New investments
ice_new20_newinvestment <- ice_new20 %>%
  filter(newinvestment == 1)

# end therese's code

#ice_new20_newinvestment$date2[ice_new20_newinvestment$indicator == "Pre shock"] <- ymd(ice_new20_newinvestment$date_preshock[ice_new20_newinvestment$indicator == "Pre shock"])
  #ifelse(ice_new20_newinvestment$indicator == "Pre shock", as.Date(ice_new20_newinvestment$date_preshock, "%Y-%m-%d"), as.Date(ice_new20_newinvestment$date_postshock, "%Y-%m-%d"))
#ice_new20_newinvestment$year[ice_new20_newinvestment$indicator == "Pre shock"] <- year(ice_new20_newinvestment$date_preshock[ice_new20_newinvestment$indicator == "Pre shock"])

# aggregate event counts by quarter

base <- data.frame(country = rep(c("United States",
                                       "Canada",
                                       "Denmark",
                                       "Norway",
                                       "Russia"), 84),
                   iso = rep(c("USA","CAN","DNK","NOR","RUS"), 84),
                   year = rep(1997:2017, each = 20),
                   q = rep(rep(1:4, each = 5), 21))

ice_quarterly <- ice_new20_newinvestment %>%
  mutate(year = ifelse(indicator == "Pre shock", year(date_preshock), year(date_postshock)),
         month = ifelse(indicator == "Pre shock", month(date_preshock), month(date_postshock))) %>%
  mutate(q = ifelse(month <= 3, 1, ifelse(month <= 6, 2, ifelse(month <= 9, 3, 4)))) %>%
  group_by(year, q, country) %>%
  summarise(count_ice = n(),
            tonnage = sum(tonnage)) %>%
  right_join(base) %>%
  mutate(count_ice = replace(count_ice, is.na(count_ice), 0),
         tonnage = replace(tonnage, is.na(tonnage), 0)) %>%
  unite(quarter, year, q, sep = "-Q", remove = F) 

# add gdp, milex, polity, postshock, and land_oriented variables

ice_quarterly <- ice_quarterly %>%
    #filter(year < 2010) %>% # so that we have the same amount of time before and after the shock
    mutate(land_oriented = as.numeric(country %in% c("Russia","Norway"))) %>%
    mutate(postshock = as.numeric(quarter > "2007-Q2")) %>%
    left_join(cinc, by = c("country", "year")) %>%
    mutate(milex = log(milex)) %>%
    left_join(gdp, by = c("iso" = "LOCATION", "quarter" = "TIME")) %>%
    left_join(polity)

###########
# t-tests
###########

# comparing land oriented countries before and after the shock
# p-value = 0.2243
# mean of x    mean of y 
# 0.14285714     0.07142857 
t.test(ice_quarterly$count_ice[ice_quarterly$land_oriented & ice_quarterly$postshock],
       ice_quarterly$count_ice[ice_quarterly$land_oriented & !ice_quarterly$postshock])

# comparing production oriented countries before and after the shock
# p-value = 0.3156
# mean of x   mean of y 
# 0.023809524 0.007936508 
t.test(ice_quarterly$count_ice[!ice_quarterly$land_oriented & ice_quarterly$postshock],
       ice_quarterly$count_ice[!ice_quarterly$land_oriented & !ice_quarterly$postshock])

# land oriented vs. production oriented after the shock
# p-value = 0.027
# mean of x  mean of y 
# 0.14285714 0.02380952
t.test(ice_quarterly$count_ice[ice_quarterly$land_oriented & ice_quarterly$postshock],
       ice_quarterly$count_ice[!ice_quarterly$land_oriented & ice_quarterly$postshock])

# land oriented vs. production oriented before the shock
# p-value = 0.03307
# mean of x   mean of y 
# 0.071428571  0.007936508  
t.test(ice_quarterly$count_ice[ice_quarterly$land_oriented & !ice_quarterly$postshock],
       ice_quarterly$count_ice[!ice_quarterly$land_oriented & !ice_quarterly$postshock])

##############
# diff-in-diff
##############
ice_quarterly$did <- ice_quarterly$land_oriented * ice_quarterly$postshock

didreg <- lm(count_ice ~ did + land_oriented + postshock, data = ice_quarterly)
r2 <- signif(summary(didreg)$r.squared, 2)
d <- coeftest(didreg, vcov = vcovHC(didreg, cluster = "countryname", type="HC1"))

did_control = lm(count_ice ~ did + land_oriented + postshock + gdp_pct + milex, data = ice_quarterly)
r2c <- signif(summary(did_control)$r.squared, 2)
dc <- coeftest(did_control, vcov = vcovHC(did_control, cluster = "countryname", type="HC1"))

did_control2 = lm(count_ice ~ did + land_oriented + postshock + gdp_vol + milex, data = ice_quarterly)
r2c2 <- signif(summary(did_control2)$r.squared, 2)
dc2 <- coeftest(did_control2, vcov = vcovHC(did_control2, cluster = "countryname", type="HC1"))

did_control3 = lm(count_ice ~ did + land_oriented + postshock + I(log(gdp_vol)) + milex, data = ice_quarterly)
r2c3 <- signif(summary(did_control3)$r.squared, 2)
dc3 <- coeftest(did_control3, vcov = vcovHC(did_control3, cluster = "countryname", type="HC1"))

# did_control4 = lm(count_ice ~ did + land_oriented + postshock + I(log(gdp_vol)) + milex + polity2, data = ice_quarterly)
# summary(did_control4) 

did_control4 = lm(count_ice ~ did + land_oriented + postshock + I(log(gdp_vol)) + gdp_pct + milex, data = ice_quarterly)
r2c4 <- signif(summary(did_control4)$r.squared, 2)
dc4 <- coeftest(did_control4, vcov = vcovHC(did_control4, cluster = "countryname", type="HC1"))

# Table A4 
stargazer(d, dc2, dc3, dc, dc4, 
          dep.var.labels = "New Icebreakers", covariate.labels = covars, 
          add.lines = list(c("Observations", nobs(didreg), nobs(did_control2), nobs(did_control3), nobs(did_control), nobs(did_control4)), 
                           c("$R2$", r2, r2c2, r2c3, r2c, r2c4)),
          type = "html", out = "icebreakers_did.html",
          notes = c("Results from OLS regressions with quarterly count of new icebreakers as the outcome. Units are country-quarters, spanning the years 1997 to 2017. Robust standard  errors clustered by country in parentheses. Russia and Norway are coded as resource-dependent. The post-shock period begins in June 2007. Models 2-5 drop pre-2002 observations for Russia due to missing quarterly GDP data."))

##############
# tonnage (Table A5)
##############

# comparing land oriented countries before and after the shock
# p-value = 0.5043
# mean of x mean of y 
# 743.7738  448.5357
t.test(ice_quarterly$tonnage[ice_quarterly$land_oriented & ice_quarterly$postshock],
       ice_quarterly$tonnage[ice_quarterly$land_oriented & !ice_quarterly$postshock])

# comparing production oriented countries before and after the shock
# p-value = 0.4979
# mean of x mean of y 
# 40.95238 132.24603 
t.test(ice_quarterly$tonnage[!ice_quarterly$land_oriented & ice_quarterly$postshock],
       ice_quarterly$tonnage[!ice_quarterly$land_oriented & !ice_quarterly$postshock])

# land oriented vs. production oriented after the shock
# p-value = 0.02461
# mean of x mean of y 
# 743.77381  40.95238 
t.test(ice_quarterly$tonnage[ice_quarterly$land_oriented & ice_quarterly$postshock],
       ice_quarterly$tonnage[!ice_quarterly$land_oriented & ice_quarterly$postshock])

# land oriented vs. production oriented before the shock
# p-value = 0.36
# mean of x mean of y 
# 448.5357  132.2460  
t.test(ice_quarterly$tonnage[ice_quarterly$land_oriented & !ice_quarterly$postshock],
       ice_quarterly$tonnage[!ice_quarterly$land_oriented & !ice_quarterly$postshock])

didreg <- lm(tonnage ~ did + land_oriented + postshock, data = ice_quarterly)
r2 <- signif(summary(didreg)$r.squared, 2)
d <- coeftest(didreg, vcov = vcovHC(didreg, cluster = "countryname", type="HC1"))

did_control = lm(tonnage ~ did + land_oriented + postshock + gdp_pct + milex, data = ice_quarterly)
r2c <- signif(summary(did_control)$r.squared, 2)
dc <- coeftest(did_control, vcov = vcovHC(did_control, cluster = "countryname", type="HC1"))

did_control2 = lm(tonnage ~ did + land_oriented + postshock + gdp_vol + milex, data = ice_quarterly)
r2c2 <- signif(summary(did_control2)$r.squared, 2)
dc2 <- coeftest(did_control2, vcov = vcovHC(did_control2, cluster = "countryname", type="HC1"))

did_control3 = lm(tonnage ~ did + land_oriented + postshock + I(log(gdp_vol)) + milex, data = ice_quarterly)
r2c3 <- signif(summary(did_control3)$r.squared, 2)
dc3 <- coeftest(did_control3, vcov = vcovHC(did_control3, cluster = "countryname", type="HC1"))

# did_control4 = lm(tonnage ~ did + land_oriented + postshock + I(log(gdp_vol)) + milex + polity2, data = ice_quarterly)
# summary(did_control4) 

did_control4 = lm(tonnage ~ did + land_oriented + postshock + I(log(gdp_vol)) + gdp_pct + milex, data = ice_quarterly)
r2c4 <- signif(summary(did_control4)$r.squared, 2)
dc4 <- coeftest(did_control4, vcov = vcovHC(did_control4, cluster = "countryname", type="HC1"))

# Table A5
stargazer(d, dc2, dc3, dc, dc4, 
          dep.var.labels = "Icebreaker Tonnage", covariate.labels = covars, 
          add.lines = list(c("Observations", nobs(didreg), nobs(did_control2), nobs(did_control3), nobs(did_control), nobs(did_control4)), 
                           c("$R2$", r2, r2c2, r2c3, r2c, r2c4)),
          type = "html", out = "tonnage_did.html",
          notes = c("Results from OLS regressions with new icebreaker tonnage as the outcome. Units are country-quarters, spanning the years 1997 to 2017. Robust standard  errors clustered by country in parentheses. Russia and Norway are coded as resource-dependent. The post-shock period begins in June 2007. Models 2-5 drop pre-2002 observations for Russia due to missing quarterly GDP data."))


###############################
# Bases (Table A3)
###############################
bases <- read.csv("Arctic Bases 06-04-18 LL.csv",
                  na.strings = c("NA", ""),
                  stringsAsFactors = F)

bases_sub <- bases %>%
  filter(!is.na(Country)) %>%
  dplyr::select(Country, 
                Name,
                Announcement.Date,
                Initial.Date.of.Instatement,
                Date.of.Decommission,
                Date.of.Reinstatement,
                Date.of.Upgrades,
                Date.of.Upgrades.2,
                Status.Pre.1997,
                Status.Pre.Shock, 
                Status.Post.Shock) %>%
  
  mutate(Announcement.Date = mdy(Announcement.Date),
         Initial.Date.of.Instatement = mdy(Initial.Date.of.Instatement),
         Date.of.Decommission = mdy(Date.of.Decommission),
         Date.of.Reinstatement = mdy(Date.of.Reinstatement),
         Date.of.Upgrades = mdy(Date.of.Upgrades),
         Date.of.Upgrades.2 = mdy(Date.of.Upgrades.2)) %>%
  
  mutate(base = seq(1, nrow(.)))

bases_sub_long <- bases_sub %>%
  
  gather(status, date, Announcement.Date:Date.of.Upgrades.2) %>%
  
  filter(!is.na(date)) %>%
  filter(date < year_20year_max) %>%
  
  # creating time slots
  mutate(shock = ifelse(date < shock, "preshock", "postshock")) %>%
  filter(!is.na(shock)) %>%
  
  # Latest dates
  group_by(base, shock) %>%
  slice(which.max(date)) %>%
  ungroup() %>%
  mutate(new = paste(status, date, sep = ";")) %>%
  dplyr::select(-status, - date) %>%
  spread(shock, new) %>%
  separate(preshock, c("status_preshock", "date_preshock"), sep = ";") %>%
  separate(postshock, c("status_postshock", "date_postshock"), sep = ";") %>%
  
  # Coding new categories
  mutate(pre_new = ifelse(status_preshock %in% c("Initial.Date.of.Instatement") & date_preshock > year_20year_min, "Built Pre-Shock", NA),
         pre_new = replace(pre_new, status_preshock %in% c("Initial.Date.of.Instatement") & date_preshock < year_20year_min, "Built Pre-1997"),
         pre_new = replace(pre_new, status_preshock %in% c("Date.of.Reinstatement",
                                                           "Date.of.Upgrades",
                                                           "Date.of.Upgrades.2") & date_preshock > year_20year_min, "Reopened/upgraded pre-shock"),
         pre_new = replace(pre_new, status_preshock %in% c("Date.of.Reinstatement",
                                                           "Date.of.Upgrades",
                                                           "Date.of.Upgrades.2") & date_preshock < year_20year_min, "Reopened/upgraded pre-1997"),
         pre_new = replace(pre_new, status_preshock %in% c("Date.of.Decommission"), "Closed")) %>%
  
  mutate(post_new = ifelse(is.na(status_postshock), pre_new, NA),
         post_new = replace(post_new, status_postshock %in% c("Initial.Date.of.Instatement"), "Built Post-Shock"),
         post_new = replace(post_new, status_postshock %in% c("Date.of.Reinstatement", 
                                                              "Date.of.Upgrades", 
                                                              "Date.of.Upgrades.2"), "Upgraded"),
         post_new = replace(post_new, status_postshock %in% c("Date.of.Decommission"), "Closed")) %>%
  
  gather(indicator, value, pre_new:post_new) %>%
  
  mutate(newinvestment = ifelse(indicator == "pre_new" & value %in% c("Built Pre-Shock",
                                                                      "Reopened/upgraded pre-shock"), 1, 0),
         newinvestment = replace(newinvestment, indicator == "post_new" & value %in% c("Built Post-Shock",
                                                                                       "Upgraded"), 1)) %>%
  
  mutate(count = ifelse(indicator == "pre_new" & value %in% c("Built Pre-Shock",
                                                              "Built Pre-1997",
                                                              "Reopened/upgraded pre-shock",
                                                              "Reopened/upgraded pre-1997"), 1, 0),
         count = replace(count, indicator == "post_new" & value %in% c("Built Post-Shock",
                                                                       "Upgraded",
                                                                       "Built Pre-Shock",
                                                                       "Built Pre-1997",
                                                                       "Reopened/upgraded pre-shock",
                                                                       "Reopened/upgraded pre-1997"), 1)) %>%
  
  filter(value != "Closed") 

bases_sub_long$Country <- factor(bases_sub_long$Country,
                                 levels = c("United States",
                                            "Canada",
                                            "Denmark",
                                            "Norway",
                                            "Russia"))

bases_sub_long$countryname_gdp <- factor(bases_sub_long$Country,
                                         levels = c("United States",
                                                    "Canada",
                                                    "Denmark",
                                                    "Norway",
                                                    "Russia"),
                                         labels = c("United States\n(GDP 15.1 trillion USD)",
                                                    "Canada\n(GDP 1.6 trillion USD)",
                                                    "Denmark\n(GDP 0.3 trillion USD)",
                                                    "Norway\n(GDP 0.4 trillion USD)",
                                                    "Russia\n(GDP 1.5 trillion USD)"))


table(bases_sub_long$value)
bases_sub_long$value <- factor(bases_sub_long$value,
                               levels = c("Built Pre-1997",
                                          "Reopened/upgraded pre-1997",
                                          "Reopened/upgraded pre-shock",
                                          "Upgraded",
                                          "Built Post-Shock"),
                               labels = c("Built Pre-1997",
                                          "Reopened/upgraded pre-1997",
                                          "Reopened/upgraded pre-shock",
                                          "Upgraded/reopened",
                                          "New base"))

levels(bases_sub_long$value) <- list(`Built Pre-1997` = c("Built Pre-1997"), 
                                     `Upgraded/reopened` = c("Reopened/upgraded pre-1997",
                                                             "Reopened/upgraded pre-shock",
                                                             "Upgraded/reopened"),
                                     `New base` = c("New base"))

table(bases_sub_long$indicator)
bases_sub_long$indicator <- factor(bases_sub_long$indicator,
                                   levels = c("pre_new", "post_new"),
                                   labels= c("Pre shock", "Post shock"))



## New investments
bases_newinvestment <- bases_sub_long %>%
  filter(newinvestment == 1)

# end therese's code

# aggregate event counts by quarter, merge with ice_quarterly dataset

bases_quarterly <- bases_newinvestment %>%
  mutate(year = ifelse(indicator == "Pre shock", year(date_preshock), year(date_postshock)),
         month = ifelse(indicator == "Pre shock", month(date_preshock), month(date_postshock))) %>%
  mutate(q = ifelse(month <= 3, 1, ifelse(month <= 6, 2, ifelse(month <= 9, 3, 4)))) %>%
  group_by(year, q, Country) %>%
  summarise(count_bases = n(), new_bases = sum(value == "New base")) %>%
  right_join(ice_quarterly, by = c("Country" = "country", "year", "q")) %>%
  mutate(count_bases = replace(count_bases, is.na(count_bases), 0),
         new_bases = replace(new_bases, is.na(new_bases), 0)) %>%
  unite(quarter, year, q, sep = "-Q", remove = F) 

###########
# t-tests
###########

# comparing land oriented countries before and after the shock
# p-value = 8.725e-05
# mean of x    mean of y 
# 0.2738095     0.0000000 
t.test(bases_quarterly$count_bases[bases_quarterly$land_oriented & bases_quarterly$postshock],
       bases_quarterly$count_bases[bases_quarterly$land_oriented & !bases_quarterly$postshock])

# comparing production oriented countries before and after the shock
# p-value = 0.3156
# mean of x   mean of y 
# 0.023809524 0.007936508 
t.test(bases_quarterly$count_bases[!bases_quarterly$land_oriented & bases_quarterly$postshock],
       bases_quarterly$count_bases[!bases_quarterly$land_oriented & !bases_quarterly$postshock])

# land oriented vs. production oriented after the shock
# p-value = 0.0003833
# mean of x  mean of y 
# 0.27380952 0.02380952
t.test(bases_quarterly$count_bases[bases_quarterly$land_oriented & bases_quarterly$postshock],
       bases_quarterly$count_bases[!bases_quarterly$land_oriented & bases_quarterly$postshock])

# land oriented vs. production oriented before the shock
# p-value = 0.3192
# mean of x   mean of y 
# 0.000000000  0.007936508  
t.test(bases_quarterly$count_bases[bases_quarterly$land_oriented & !bases_quarterly$postshock],
       bases_quarterly$count_bases[!bases_quarterly$land_oriented & !bases_quarterly$postshock])

###########
didreg <- lm(count_bases ~ did + land_oriented + postshock, data = bases_quarterly)
r2 <- signif(summary(didreg)$r.squared, 2)
d <- coeftest(didreg, vcov = vcovHC(didreg, cluster = "countryname", type="HC1"))

did_control = lm(count_bases ~ did + land_oriented + postshock + gdp_pct + milex, data = bases_quarterly)
r2c <- signif(summary(did_control)$r.squared, 2)
dc <- coeftest(did_control, vcov = vcovHC(did_control, cluster = "countryname", type="HC1"))

did_control2 = lm(count_bases ~ did + land_oriented + postshock + gdp_vol + milex, data = bases_quarterly)
r2c2 <- signif(summary(did_control2)$r.squared, 2)
dc2 <- coeftest(did_control2, vcov = vcovHC(did_control2, cluster = "countryname", type="HC1"))

did_control3 = lm(count_bases ~ did + land_oriented + postshock + I(log(gdp_vol)) + milex, data = bases_quarterly)
r2c3 <- signif(summary(did_control3)$r.squared, 2)
dc3 <- coeftest(did_control3, vcov = vcovHC(did_control3, cluster = "countryname", type="HC1"))

# did_control4 = lm(count_bases ~ did + land_oriented + postshock + I(log(gdp_vol)) + milex + polity2, data = bases_quarterly)
# summary(did_control4) 

did_control4 = lm(count_bases ~ did + land_oriented + postshock + I(log(gdp_vol)) + gdp_pct + milex, data = bases_quarterly)
r2c4 <- signif(summary(did_control4)$r.squared, 2)
dc4 <- coeftest(did_control4, vcov = vcovHC(did_control4, cluster = "countryname", type="HC1"))

#Table A3
stargazer(d, dc2, dc3, dc, dc4, 
          dep.var.labels = "Bases New Investment", covariate.labels = covars, 
          add.lines = list(c("Observations", nobs(didreg), nobs(did_control2), nobs(did_control3), nobs(did_control), nobs(did_control4)), 
                           c("$R2$", r2, r2c2, r2c3, r2c, r2c4)),
          type = "html", out = "bases_did.html", 
          notes = c("Results from OLS regressions with quarterly count of new investments in military bases as the outcome. Units are country-quarters, spanning the years 1997 to 2017. Robust standard  errors clustered by country in parentheses. Russia and Norway are coded as resource-dependent. The post-shock period begins in June 2007. Models 2-5 drop pre-2002 observations for Russia due to missing quarterly GDP data."))

# stargazer(didreg, did_control2, did_control3, did_control, #did_control4, 
#           dep.var.labels = "Bases New Investment", covariate.labels = covars, 
#           type = "html", out = "bases_did.html")


